#问题：如果样本表达值0比较多的话，应该进行排除，但是无从着手
#差异分析####
library("DESeq2")
setwd("D:\\R_DATA")#设置工作路径
rna_data<-read.csv("mRNA_exprSet.csv",header = TRUE,row.names = 1,sep = ',')#读取csv文件，行列名为第一行第一列，分隔符为，
ID<-sub(".*([0-9]{2})$", "\\1", colnames(rna_data))#使用sub函数提取列名中的最后两个数字，以便确定分组信息

#创建样本信息集
sample <- data.frame(
  sampleID = ID,
  condition = ifelse(ID == "01", "Tumor", "Normal")
)

# 创建DESeq2数据集对象
dds <- DESeqDataSetFromMatrix(countData = rna_data,
                              colData = sample,
                              design = ~ condition)

# 运行DESeq2分析，获取分析结果
dds <- DESeq(dds)
res <- results(dds)
print(res)

#排序，筛选差异表达显著基因
res_ordered<- res[order(res$pvalue),]
head(res_ordered)

#火山图####
library("ggplot2")
res$category <- "not significant"
res$category[res$pvalue < 0.05/(19632*123) & res$log2FoldChange > 1.2] <- "upregulated"
res$category[res$pvalue < 0.05/(19632*123) & res$log2FoldChange < -1.2] <- "downregulated"

volcano_plot <- ggplot(res, aes(x=log2FoldChange, y=-log10(pvalue), color=category)) +
  geom_point(alpha=0.5) +  # 点的透明度设置为0.5
  geom_vline(xintercept = 1.2, linetype="dashed", color="black") +  # 上调分隔线
  geom_vline(xintercept = -1.2, linetype="dashed", color="black") +  # 下调分隔线
  geom_hline(yintercept = -log10(0.05/19632/123), linetype="dashed", color="black") + # 添加P值显著性水平线
  scale_color_manual(values=c("not significant"="grey", "upregulated"="red", "downregulated"="blue")) + #各类规定颜色
  labs(x="Log2 Fold Change", y="-Log10 P-value", title="Volcano Plot") + #设计图标签
  theme_minimal()
print(volcano_plot)
